{
 "cells": [
  {
   "cell_type": "code",
   "id": "3e4525da-cc7e-42b2-ad9e-7494df441322",
   "metadata": {
    "tags": []
   },
   "source": [
    "# -*- coding: utf-8 -*-\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from simpletransformers.classification import ClassificationModel, ClassificationArgs\n",
    "from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score\n",
    "import torch\n",
    "\n",
    "#detect whether PyTorch can see a CUDA device; the flag is passed as use_cuda when the models are created\n",
    "cuda_available = torch.cuda.is_available()\n",
    "#last expression of the cell, so the notebook displays the flag\n",
    "cuda_available"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "30fa839e-40de-43d7-80b7-c4c784a3c079",
   "metadata": {
    "tags": []
   },
   "source": [
    "#import all bot and facebook responses to ONQ2 (\"discrimination\" question)\n",
    "#forward slashes need no escaping and resolve on Windows, Linux, and macOS alike\n",
    "df_raw = pd.read_excel(\"01_input data/onq2.xlsx\")\n",
    "\n",
    "#create new data frame (df) consisting of all bot responses (label 1) and 800 randomly selected facebook responses (label 0)\n",
    "df_bot_responses = df_raw[df_raw['label'] == 1]\n",
    "df_facebook_responses = df_raw[df_raw['label'] == 0].sample(n = 800, random_state = 746)\n",
    "\n",
    "#combine both groups and shuffle the rows with a fixed seed so the row order is reproducible\n",
    "df = pd.concat([df_bot_responses, df_facebook_responses]).sample(frac=1, random_state=461).reset_index(drop=True)\n",
    "\n",
    "#optional: export df as excel file\n",
    "#df.to_excel(\"02_output data/onq2_balanced_sample.xlsx\", index=False, engine=\"openpyxl\")\n",
    "\n",
    "#print df characteristics (first rows and (rows, columns) shape)\n",
    "print(df.head(5))\n",
    "print(df.shape)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "04f31cc9-cec6-4a24-a449-bf28a00a623a",
   "metadata": {
    "tags": []
   },
   "source": [
    "#define hyperparameters shared by every model trained in this notebook\n",
    "#NOTE(review): the early-stopping settings presumably only take effect when evaluate_during_training\n",
    "#is enabled and an eval_df is passed to train_model; neither happens here - confirm against the simpletransformers docs\n",
    "train_args = ClassificationArgs(\n",
    "    use_early_stopping=True,\n",
    "    early_stopping_delta=0.01,\n",
    "    early_stopping_metric=\"f1\",\n",
    "    early_stopping_metric_minimize=False,\n",
    "    early_stopping_patience=5,\n",
    "    evaluate_during_training_steps=1000,\n",
    "    manual_seed=4,\n",
    "    save_steps=-1,\n",
    "    save_model_every_epoch=False,\n",
    "    overwrite_output_dir=True,\n",
    "    use_multiprocessing=True\n",
    ")\n",
    "\n",
    "#define training epochs and learning rates (all 3 x 3 combinations are tried below)\n",
    "p_epochs= [5, 10, 15]\n",
    "p_lr = [1e-3, 1e-4, 1e-5]\n",
    "\n",
    "#split responses for training (60%), validation (20%), and testing (20%)\n",
    "train_df, temp_df = train_test_split(df, test_size=0.4, random_state=93)\n",
    "val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=216)\n",
    "\n",
    "#hand the model explicitly named 'text' and 'labels' columns: without a 'labels' header\n",
    "#simpletransformers falls back to treating column 0 as text and column 1 as labels,\n",
    "#which silently depends on the column order of the excel file\n",
    "train_data = train_df[['text', 'label']].rename(columns={'label': 'labels'})\n",
    "val_data = val_df[['text', 'label']].rename(columns={'label': 'labels'})\n",
    "\n",
    "#initialize variables to store optimal hyperparameters\n",
    "best_lr = None\n",
    "best_epochs = None\n",
    "best_val_metric = float('-inf')  #below any possible F1, so a combination is always selected\n",
    "\n",
    "#iterate through combinations of training epochs and learning rates\n",
    "for epochs in p_epochs:\n",
    "    for lr in p_lr:\n",
    "        #train_args is shared, so the current combination is written into it before each run\n",
    "        train_args.learning_rate = lr\n",
    "        train_args.num_train_epochs = epochs\n",
    "\n",
    "        #create a fresh classification model so no weights carry over between combinations\n",
    "        model = ClassificationModel(\n",
    "            \"bert\", \"bert-base-german-cased\",\n",
    "            use_cuda=cuda_available,\n",
    "            args=train_args\n",
    "        )\n",
    "\n",
    "        #fine-tune the model on the training set\n",
    "        model.train_model(train_data)\n",
    "\n",
    "        #evaluate the model on the validation set; each keyword adds a metric under that key in val_result\n",
    "        val_result, val_model_outputs, val_wrong_predictions = model.eval_model(\n",
    "            val_data,\n",
    "            acc=accuracy_score,\n",
    "            f1=f1_score,\n",
    "            precision=precision_score,\n",
    "            recall=recall_score,\n",
    "            auc=roc_auc_score)\n",
    "\n",
    "        #keep the combination with the highest validation F1 (strict comparison: ties keep the earlier one)\n",
    "        if val_result[\"f1\"] > best_val_metric:\n",
    "            best_val_metric = val_result[\"f1\"]\n",
    "            best_lr = lr\n",
    "            best_epochs = epochs"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "0470e663-0aa5-413b-905c-75a328d933a3",
   "metadata": {
    "tags": []
   },
   "source": [
    "#abort with a clear message if the hyperparameter search did not select a combination\n",
    "if best_lr is None or best_epochs is None:\n",
    "    raise RuntimeError(\"hyperparameter search did not select a learning rate and epoch count\")\n",
    "\n",
    "#fine-tune a new model with the best hyperparameters on the training set\n",
    "train_args.learning_rate = best_lr\n",
    "train_args.num_train_epochs = best_epochs\n",
    "model = ClassificationModel(\n",
    "    \"bert\", \"bert-base-german-cased\",\n",
    "    use_cuda=cuda_available,\n",
    "    args=train_args\n",
    ")\n",
    "\n",
    "#pass explicitly named 'text' and 'labels' columns: without a 'labels' header simpletransformers\n",
    "#falls back to treating column 0 as text and column 1 as labels, which depends on column order\n",
    "model.train_model(train_df[['text', 'label']].rename(columns={'label': 'labels'}))"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "8233436d-514c-4b45-82d0-2de436d4e5d8",
   "metadata": {
    "tags": []
   },
   "source": [
    "#predict a label for every held-out test text (the second return value of predict is discarded)\n",
    "test_texts = test_df['text'].tolist()\n",
    "test_predictions, _ = model.predict(test_texts)\n",
    "\n",
    "#score the predictions against the true labels with precision, recall, and F1\n",
    "y_true = test_df['label']\n",
    "precision, recall, f1 = (\n",
    "    metric(y_true, test_predictions)\n",
    "    for metric in (precision_score, recall_score, f1_score)\n",
    ")\n",
    "\n",
    "#report the three scores, one per line\n",
    "for name, value in ((\"Precision\", precision), (\"Recall\", recall), (\"F1 Score\", f1)):\n",
    "    print(f\"{name}: {value}\")"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "e77fc9fd-97a4-4afb-ada5-0fc520634ef3",
   "metadata": {},
   "source": [
    "#collect text, ID, true label, and predicted label of every test response in one table\n",
    "results_df = pd.DataFrame({\n",
    "    \"Text\": test_df['text'].tolist(),\n",
    "    \"ID\": test_df['ID'].tolist(),\n",
    "    \"True Label\": test_df['label'].tolist(),\n",
    "    \"Predicted Label\": test_predictions\n",
    "})\n",
    "\n",
    "#export the table as excel file; forward slashes need no escaping and resolve on Windows, Linux, and macOS alike\n",
    "results_df.to_excel(\"02_output data/onq2_onq2m_predictions.xlsx\", index=False, engine=\"openpyxl\")"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "#save fine-tuned model for cross-corpus predictions and token analysis\n",
    "#model weights and tokenizer go to the same folder; forward slashes need no escaping and work on every OS\n",
    "model_dir = \"02_output data/models/onq2m\"\n",
    "model.model.save_pretrained(model_dir)\n",
    "model.tokenizer.save_pretrained(model_dir)"
   ],
   "id": "9b96aaa640073320",
   "outputs": [],
   "execution_count": null
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.20"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
